# Run this script in a fresh R session (e.g. `Rscript`) rather than wiping the
# workspace with rm(list = ls()): that call deletes the caller's objects when
# the file is source()d, yet leaves loaded packages and options untouched, so
# it does not actually provide a clean starting state.

library(dplyr)
library(glmnet)
library(tidyr)
library(randomForest)
library(reshape2)
library(ggplot2)

# Load the MTurk-coded documents (strings kept as character) and expose the
# `Text` column under the lower-case name `text`.
mturk_data <- rename(
  read.csv("both_relevant_sentiment.csv", stringsAsFactors = FALSE),
  text = Text
)

#########################
# check MTurk vs Google #
#########################

# The Google sentiment output is read from three CSV exports and stacked.
goog1 <- read.csv("sentiment_analysis_Google_4611.csv")
goog2 <- read.csv("sentiment_analysis_Google_5378.csv")
goog3 <- read.csv("sentiment_analysis_Google_2337.csv")

# Keep only the 'score' and 'magnitude' records and spread them into two
# columns (`score`, `magnitude`).
google_sentiment <- rbind(goog1, goog2, goog3)
google_sentiment <- select(google_sentiment, -X, -doc_count)
google_sentiment <- filter(
  google_sentiment,
  sentence_id %in% c("score", "magnitude")
)
google_sentiment <- spread(
  google_sentiment,
  key = sentence_id,
  value = sentiment_score
)

# NOTE(review): row 4123 is dropped by position, which depends on the row
# order produced by spread(); the reason is not visible here -- confirm it
# still targets the intended record if the inputs change.
google_sentiment <- google_sentiment[-4123, ]

# correct IDs that were offset by 1
# NOTE(review): as.numeric() assumes doc_id is numeric or character; if it
# were read as a factor this would return level codes -- verify.
google_sentiment[["doc_id"]] <- as.numeric(google_sentiment[["doc_id"]]) - 1

# Rename the columns used downstream and make `year` numeric.
google_sentiment <- select(
  google_sentiment,
  ID = doc_id,
  google_sentiment_score = score,
  google_sentiment_magnitude = magnitude,
  year = year,
  country = country
)
google_sentiment$year <- as.numeric(google_sentiment$year)

# Attach the Google scores to the MTurk documents; year/country are taken
# from the Google table, and only documents present in both are kept.
mturk_data <- select(mturk_data, -Majority, -country, -year)
mturk_data <- inner_join(
  mturk_data,
  google_sentiment,
  by = c("Document_ID" = "ID")
)

#####################################
# Checking a few other dictionaries #
#####################################
library(quanteda)
library(tidytext)

# Corpus setup: build a quanteda corpus from the merged MTurk data frame.
mturk_corpus <- corpus(mturk_data)

# Word-level tokenisation with punctuation, symbols, numbers and URLs removed.
# NOTE(review): `remove_twitter` and `remove_hyphens` belong to older quanteda
# releases -- confirm the installed version still honours them.
tok <- tokens(
  mturk_corpus,
  what = "word",
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_hyphens = TRUE,
  remove_twitter = TRUE,
  remove_url = TRUE,
  include_docvars = TRUE,
  verbose = TRUE
)

# Drop tokens matching any of three regular expressions: a digit or hyphen,
# a punctuation character, or a total length of one or two characters.
tok.m <- tokens_select(
  tok,
  c("[\\d-]", "[[:punct:]]", "^.{1,2}$"),
  valuetype = "regex",
  selection = "remove",
  verbose = TRUE
)

# Lower-case the remaining tokens; `tok.r` is what the dictionary lookups use.
tok.r <- tokens_tolower(tok.m)

# Score every document against a tidytext sentiment lexicon and append the
# result to `df`.
#
# Arguments:
#   df              data frame with one row per document, in the same order
#                   as the documents in `tokens_obj`.
#   tokens_obj      tokenised documents, passed to tokens_lookup().
#   dictionary_name lexicon name passed to get_sentiments(), e.g. 'bing'.
#
# Returns `df` with an extra numeric column `<dictionary_name>_sentiment_score`
# holding, per document,
#   (# tokens mapped to 'positive' - # tokens mapped to 'negative') / # tokens.
# A document with no tokens gets NaN (0 / 0).
#
# Stops with an error when the number of documents differs from nrow(df),
# because the scores could not be aligned with rows in that case.
get_dictionary_sentiment <- function(df, tokens_obj, dictionary_name) {
  dictionary_obj <- get_sentiments(dictionary_name)
  data_dictionary_obj <- as.dictionary(dictionary_obj)
  # Tokens that hit no dictionary entry are kept as 'unmatched' so that the
  # denominator below is the document's full token count.
  tok_dictionary_obj <- tokens_lookup(tokens_obj, data_dictionary_obj,
                                      nomatch = 'unmatched')

  n_docs <- length(tok_dictionary_obj)
  if (n_docs != nrow(df)) {
    stop("`df` has ", nrow(df), " rows but `tokens_obj` has ", n_docs,
         " documents", call. = FALSE)
  }

  score_column <- paste(dictionary_name, 'sentiment_score', sep = '_')

  # seq_len() keeps this safe for zero documents; vapply() builds the whole
  # column at once instead of writing into the data frame row by row.
  df[[score_column]] <- vapply(seq_len(n_docs), function(i) {
    doc_tokens <- tok_dictionary_obj[i][[1]]
    n_pos <- sum(doc_tokens == 'positive')
    n_neg <- sum(doc_tokens == 'negative')
    (n_pos - n_neg) / length(doc_tokens)
  }, numeric(1))

  df
}

# Add one sentiment-score column per lexicon (bing, nrc, loughran), in that
# order, by threading mturk_data through get_dictionary_sentiment().
mturk_data <- Reduce(
  function(scored, lexicon) get_dictionary_sentiment(scored, tok.r, lexicon),
  c("bing", "nrc", "loughran"),
  mturk_data
)

# Augment the prediction data with vector representations.
# "max" variant: drop the metadata columns, shift doc_id down by one, join on
# the document ID, then remove the non-predictor columns.
word2vec <- read.csv("keyword_filtered_tiles_50_max.csv")
word2vec <- select(word2vec, -year, -country, -plenary_doc_id, -session,
                   -Majority, -text)
word2vec$doc_id <- word2vec$doc_id - 1
mturk_data_max <- left_join(mturk_data, word2vec,
                            by = c("Document_ID" = "doc_id"))
mturk_data_max <- select(mturk_data_max, -year, -country, -text, -Document_ID)

# "mean" variant: same steps on the second file (`word2vec` is overwritten).
word2vec <- read.csv("keyword_filtered_tiles_50_mean.csv")
word2vec <- select(word2vec, -year, -country, -plenary_doc_id, -session,
                   -Majority, -text)
word2vec$doc_id <- word2vec$doc_id - 1
mturk_data_mean <- left_join(mturk_data, word2vec,
                             by = c("Document_ID" = "doc_id"))
mturk_data_mean <- select(mturk_data_mean, -year, -country, -text, -Document_ID)

# ---- Cross-validated comparison of the sentiment prediction models ----
#
# Each row is randomly assigned one of ten fold labels. For every repetition j
# and every fold i, models are trained on the rows of mturk_data_mean outside
# fold i and scored on fold i by RMSE and by correlation with `alpha`.
# Three predictor sets are compared:
#   'Google Only'     - lm on google_sentiment_score (the baseline),
#   'Full'            - every column of mturk_data_mean other than alpha,
#   'Dictionary Only' - the Google and three dictionary sentiment columns.
#
# NOTE(review): the fold labels are drawn once, before the repetition loop,
# and no seed is set. All 20 repetitions therefore share the same split, so
# the deterministic lm fits give identical scores in every repetition, and
# results differ from run to run. Confirm this is intended before changing
# it, since a change would alter the reported numbers.
inds <- sample(1:10, nrow(mturk_data), replace = TRUE)

# Score a named list of predictions against the held-out outcome `truth`.
# Returns one row per model with columns RMSE, Cor, Iter, Data and Model.
# Predictions may be plain vectors or one-column matrices (as returned for
# the cv.glmnet fits), hence the as.numeric() around cor().
score_predictions <- function(predictions, truth, iter, data_label) {
  data.frame(
    RMSE = unname(vapply(predictions,
                         function(p) sqrt(mean((p - truth)^2)),
                         numeric(1))),
    Cor = unname(vapply(predictions,
                        function(p) as.numeric(cor(truth, p)),
                        numeric(1))),
    Iter = iter,
    Data = data_label,
    Model = names(predictions),
    stringsAsFactors = FALSE
  )
}

n_repeats <- 20
fold_ids <- unique(inds)

# Results are collected in a preallocated list and bound once at the end,
# instead of growing a data frame with rbind() inside the loop.
fold_results <- vector("list", n_repeats * length(fold_ids))
slot <- 0L

#the loop below can take a while to run
for (j in seq_len(n_repeats)) {
  print(j)
  for (i in fold_ids) {
    train <- mturk_data_mean[inds != i, ]
    test <- mturk_data_mean[inds == i, ]

    # Design matrices without the intercept column.
    X <- model.matrix(alpha ~ ., train)[, -1]
    X_dictionary <- model.matrix(alpha ~ google_sentiment_score + google_sentiment_magnitude + bing_sentiment_score +
                                   nrc_sentiment_score + loughran_sentiment_score, train)[, -1]
    X_test <- model.matrix(alpha ~ ., test)[, -1]
    X_test_dictionary <- model.matrix(alpha ~ google_sentiment_score + google_sentiment_magnitude + bing_sentiment_score +
                                        nrc_sentiment_score + loughran_sentiment_score, test)[, -1]

    Y <- train$alpha
    Y_test <- test$alpha

    # Baseline: Google sentiment score only.
    fit_null <- lm(alpha ~ google_sentiment_score, data = train)

    # The fitting order is kept as-is so the random-number stream is consumed
    # in the same sequence as before.
    fit_ridge_full <- cv.glmnet(X, Y, alpha = 0, nfolds = 10, type.measure = "deviance")
    fit_elastic_full <- cv.glmnet(X, Y, alpha = 0.5, nfolds = 10, type.measure = "deviance")
    fit_lasso_full <- cv.glmnet(X, Y, alpha = 1, nfolds = 10, type.measure = "deviance")
    fit_rf_full <- tuneRF(X, Y, doBest = TRUE, plot = FALSE, trace = FALSE)
    fit_lm_full <- lm(Y ~ ., data = data.frame(X))

    fit_ridge_dictionary <- cv.glmnet(X_dictionary, Y, alpha = 0, nfolds = 10, type.measure = "deviance")
    fit_elastic_dictionary <- cv.glmnet(X_dictionary, Y, alpha = 0.5, nfolds = 10, type.measure = "deviance")
    fit_lasso_dictionary <- cv.glmnet(X_dictionary, Y, alpha = 1, nfolds = 10, type.measure = "deviance")
    fit_rf_dictionary <- tuneRF(X_dictionary, Y, doBest = TRUE, plot = FALSE, trace = FALSE)
    fit_lm_dictionary <- lm(Y ~ ., data = data.frame(X_dictionary))

    p_null <- predict(fit_null, test)

    p_rf_full <- predict(fit_rf_full, test)
    p_lm_full <- predict(fit_lm_full, test)
    p_ridge_full <- predict(fit_ridge_full, X_test)
    p_elastic_full <- predict(fit_elastic_full, X_test)
    p_lasso_full <- predict(fit_lasso_full, X_test)

    p_rf_dictionary <- predict(fit_rf_dictionary, test)
    p_lm_dictionary <- predict(fit_lm_dictionary, test)
    p_ridge_dictionary <- predict(fit_ridge_dictionary, X_test_dictionary)
    p_elastic_dictionary <- predict(fit_elastic_dictionary, X_test_dictionary)
    p_lasso_dictionary <- predict(fit_lasso_dictionary, X_test_dictionary)

    # Eleven rows per fold: the baseline, then five models per predictor set.
    slot <- slot + 1L
    fold_results[[slot]] <- rbind(
      score_predictions(list(Null = p_null), Y_test, j, 'Google Only'),
      score_predictions(list(RF = p_rf_full,
                             LM = p_lm_full,
                             Ridge = p_ridge_full,
                             Elastic = p_elastic_full,
                             Lasso = p_lasso_full),
                        Y_test, j, 'Full'),
      score_predictions(list(RF = p_rf_dictionary,
                             LM = p_lm_dictionary,
                             Ridge = p_ridge_dictionary,
                             Elastic = p_elastic_dictionary,
                             Lasso = p_lasso_dictionary),
                        Y_test, j, 'Dictionary Only')
    )
  }
}

out <- bind_rows(fold_results)

# Persist the cross-validation results, then read them back: with the file in
# place, everything from load() onwards can be re-run without refitting.
save(out, file = 'model_results_new.RData')
load('model_results_new.RData')
library(ggthemes)

# Two-stage summary:
#   1. average RMSE / correlation over the folds within each repetition;
#   2. across repetitions, take the mean and the 2.5% / 97.5% quantiles.
# The regrouping between the stages is explicit rather than relying on
# summarise() dropping the last grouping variable, and the result is
# ungrouped so later steps work on a plain table.
to_plot <- out %>%
  group_by(Model, Data, Iter) %>%
  summarise(RMSE = mean(RMSE),
            Cor = mean(Cor)) %>%
  group_by(Model, Data) %>%
  summarise(mean_rmse = mean(RMSE),
            lower_rmse = quantile(RMSE, 0.025),
            upper_rmse = quantile(RMSE, 0.975),
            mean_cor = mean(Cor),
            lower_cor = quantile(Cor, 0.025),
            upper_cor = quantile(Cor, 0.975)) %>%
  ungroup()

# Baseline values from the 'Null' (Google-only) model; they become the dashed
# reference lines in the figures.
baseline_rmse <- to_plot$mean_rmse[to_plot$Model == 'Null']
baseline_cor <- to_plot$mean_cor[to_plot$Model == 'Null']

# The baseline is not plotted as a point; fix the x-axis order of the rest.
to_plot <- to_plot %>% filter(Model != 'Null')
to_plot$Model <- factor(to_plot$Model, levels = c('LM', 'RF', 'Ridge', 'Elastic', 'Lasso'))

#Figure E1
# Two stacked panels (RMSE on top, correlation below): a point and error bar
# per model and predictor set, with a dashed line at the Google-only baseline.
# The baseline is passed to geom_hline() as a constant `yintercept`, so one
# line is drawn instead of one overlapping line per row of `to_plot`.
# The legend is hidden; text annotations at hard-coded positions label the
# series instead.
# NOTE(review): the annotation colours and y positions are hard-coded --
# confirm they still match the plotted series if the results change.

# Theme shared by both panels: minimal, no legend, no grid, black axis lines.
figure_e1_theme <- theme_minimal() +
  theme(legend.position = 'none',
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))

## model_results_cor
gg_correlation <- ggplot(to_plot,
                         aes(x = Model, y = mean_cor,
                             ymin = lower_cor, ymax = upper_cor,
                             color = Data)) +
  geom_point() +
  geom_errorbar(width = 0.15) +
  geom_hline(yintercept = baseline_cor, linetype = 'dashed') +
  figure_e1_theme +
  xlab(NULL) +
  ylab('Cross-Validated Correlation') +
  annotate('text', x = 4.5, y = 0.66, label = 'All Dictionaries +\nEmbeddings', color = '#00BFC4') +
  annotate('text', x = 4.5, y = 0.605, label = 'All Dictionaries', color = '#F8766D') +
  annotate('text', x = 4.5, y = 0.533, label = 'Google Only', color = 'black')

## model_results_rmse
gg_rmse <- ggplot(to_plot,
                  aes(x = Model, y = mean_rmse,
                      ymin = lower_rmse, ymax = upper_rmse,
                      color = Data)) +
  geom_point() +
  geom_errorbar(width = 0.15) +
  geom_hline(yintercept = baseline_rmse, linetype = 'dashed') +
  figure_e1_theme +
  xlab(NULL) +
  ylab('Cross-Validated RMSE') +
  annotate('text', x = 4.5, y = 0.6425, label = 'All Dictionaries +\nEmbeddings', color = '#00BFC4') +
  annotate('text', x = 4.5, y = 0.6775, label = 'All Dictionaries', color = '#F8766D') +
  annotate('text', x = 4.5, y = 0.71, label = 'Google Only', color = 'black')

library(gridExtra)
# Draw the RMSE panel above the correlation panel.
grid.arrange(gg_rmse, gg_correlation, ncol = 1)
